# -*- coding: utf-8 -*-
"""
------------------------------------------------------------------------------
    File Name:  requests_samp
    Author   :  wanwei1029
    Date     :  2018/5/7
    Desc     :  Official documentation: http://docs.python-requests.org/en/master/
    For HTTPS certificate issues with requests, see:
    https://www.cnblogs.com/fh-fendou/p/7479812.html
------------------------------------------------------------------------------
"""
import requests
import re
import urllib.parse as urlparse

from bs4 import BeautifulSoup


def samp():
    """
    Fetch a Baidu Baike article and print the in-site ``/item/`` links it contains.

    Sends a GET request with a browser-like ``User-Agent`` header, prints the
    encoding detected by requests and the HTTP status code, then parses the
    body (decoded as UTF-8) with BeautifulSoup's ``lxml`` parser. It prints
    the number of ``<a>`` tags whose ``href`` starts with ``/item/`` and, for
    each of them, the tag's type, the raw href, and the href resolved against
    the page URL.

    Returns:
        None. All results are written to stdout.

    Raises:
        requests.exceptions.RequestException: propagated unchanged when the
            request fails, including ``Timeout`` when the server does not
            respond within the 10-second limit.
    """
    url = "https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
    }
    # requests never times out by default; bound the wait so the sample
    # cannot hang forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=10)
    # Show what requests detected before overriding it.
    print(response.encoding)
    # Force UTF-8 so response.text is decoded with that codec.
    response.encoding = "utf-8"
    print(response.status_code)
    soup = BeautifulSoup(response.text, "lxml")
    # Keep only anchors whose href begins with "/item/".
    links = soup.find_all('a', href=re.compile(r"^/item/"))
    print(len(links))
    for link in links:
        href = link["href"]
        print(type(link))
        print(href)
        # Resolve the site-relative path against the page URL.
        print(urlparse.urljoin(url, href))


if __name__ == '__main__':
    # Script entry point: run the sample selected by name; an unknown name
    # runs nothing.
    test_method = "samp"
    samples = {"samp": samp}
    runner = samples.get(test_method)
    if runner is not None:
        runner()